In [1]:
%matplotlib inline
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import os
In [2]:
# Data: https://archive.ics.uci.edu/ml/datasets/Pima+Indians+Diabetes
# Input Features: ['preg_count', 'glucose_concentration', 'diastolic_bp',
# 'triceps_skin_fold_thickness', 'two_hr_serum_insulin', 'bmi',
# 'diabetes_pedi', 'age']
# Target: 'diabetes_class'. 1 => Diabetic. 0 => Normal.
# Objective: Predict probability of diabetes
# Actual Positives: 268 (diabetic)
# Actual Negatives: 500 (normal)
# Diabetes Dataset Size: 768 samples
# Training + Eval set: 710 samples
# Test set: 58 samples
In [3]:
data_path = r'..\Data\ClassExamples\DiabetesData'
In [4]:
df = pd.read_csv(os.path.join(data_path, 'pima-indians-diabetes.data.txt'))
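The raw UCI download does not include a header row; if this copy of the file lacks one, the column names must be passed explicitly. A minimal sketch, assuming the feature order listed above:
# Only needed if the file has no header row (the raw UCI file does not)
col_names = ['preg_count', 'glucose_concentration', 'diastolic_bp',
             'triceps_skin_fold_thickness', 'two_hr_serum_insulin', 'bmi',
             'diabetes_pedi', 'age', 'diabetes_class']
df = pd.read_csv(os.path.join(data_path, 'pima-indians-diabetes.data.txt'),
                 header = None, names = col_names)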
In [5]:
df.columns
Out[5]:
In [6]:
df.shape
Out[6]:
In [7]:
df.head()
Out[7]:
In [8]:
df.diabetes_class.value_counts()
Out[8]:
In [9]:
df.corr()
Out[9]:
In [10]:
# Boolean mask selecting the diabetic samples
temp_diabetic = df.diabetes_class == 1
In [11]:
# Diabetic glucose concentration histogram
fig = plt.figure(figsize = (12, 8))
plt.hist(df[temp_diabetic].glucose_concentration)
plt.title('diabetic - glucose')
plt.xlabel('Glucose Level')
plt.ylabel('Count')
Out[11]:
In [12]:
# Normal (non-diabetic) glucose concentration histogram
fig = plt.figure(figsize = (12, 8))
plt.hist(df[~temp_diabetic].glucose_concentration)
plt.title('normal - glucose')
plt.xlabel('Glucose Level')
plt.ylabel('Count')
Out[12]:
In [13]:
fig = plt.figure(figsize = (12, 8))
plt.hist(df[temp_diabetic].bmi)
plt.xlabel('bmi')
plt.ylabel('count')
plt.title('diabetic - bmi')
Out[13]:
In [14]:
fig = plt.figure(figsize = (12, 8))
plt.hist(df[~temp_diabetic].bmi)
plt.xlabel('bmi')
plt.ylabel('count')
plt.title('normal - bmi')
Out[14]:
In [15]:
fig = plt.figure(figsize = (12, 8))
plt.hist(df[temp_diabetic].age)
plt.xlabel('age')
plt.ylabel('count')
plt.title('diabetic - age')
Out[15]:
In [16]:
fig = plt.figure(figsize = (12, 8))
plt.hist(df[~temp_diabetic].age)
plt.xlabel('age')
plt.ylabel('count')
plt.title('normal - age')
Out[16]:
In [17]:
fig = plt.figure(figsize = (12, 8))
plt.hist([df[temp_diabetic].age,
          df[~temp_diabetic].age],
         label = ['diab', 'normal'])
plt.xlabel('Age')
plt.ylabel('count')
plt.title('Age')
plt.legend()
Out[17]:
In [18]:
# Sequential split: first 710 rows for training + evaluation, last 58 held out for test
df_train_eval = df.iloc[:710]
df_test_eval = df.iloc[710:768]
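Note the split above is sequential, following the row order of the source file. If that order were not random, a shuffled split would avoid ordering bias; a minimal sketch (not what the saved CSVs below were built from, and the variable names are illustrative):
# Shuffle reproducibly with a fixed seed, then split at the same boundary
shuffled = df.sample(frac = 1, random_state = 42).reset_index(drop = True)
df_train_shuffled = shuffled.iloc[:710]
df_test_shuffled = shuffled.iloc[710:]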
In [19]:
df_train_eval.to_csv(os.path.join(data_path, 'diabetes_data_train_710samples.csv'),
                     index = True,
                     index_label = 'Row')
In [20]:
df_test_eval.to_csv(os.path.join(data_path, 'diabetes_data_eval_58samples.csv'),
                    index = True,
                    index_label = 'Row')
In [21]:
# Export all columns except the target attribute (the model predicts it)
df.to_csv(os.path.join(data_path, 'diabetest_data_test_all.csv'),
          index = True,
          index_label = 'Row',
          columns = df.columns[:-1])
In [22]:
# Predicted output from AWS ML
df_predicted = pd.read_csv(os.path.join(data_path,
                                        'output',
                                        'bp-dHhriWXAJNj-diabetest_data_test_all.csv.gz'))
In [23]:
df_predicted.head()
Out[23]:
In [24]:
diab_table = pd.crosstab(
    df.diabetes_class,
    df_predicted.bestAnswer,
    rownames = ['Actual'],
    colnames = ['Predicted'])
In [25]:
# A confusion (contingency) matrix is an important diagnostic:
# it shows how many samples were correctly classified and how many
# misclassifications occurred. Especially helpful when positive samples are few.
diab_table
Out[25]:
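As a cross-check, scikit-learn computes the same matrix; a minimal sketch (rows are actual classes, columns are predicted, matching the crosstab above):
from sklearn.metrics import confusion_matrix
# Rows: actual (0 = normal, 1 = diabetic); columns: predicted
print(confusion_matrix(df.diabetes_class, df_predicted.bestAnswer))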
In [26]:
fig = plt.figure(figsize = (12, 8))
# First crosstab column: samples predicted normal, for each actual class
plt.bar([0, 1],
        diab_table.iloc[:, 0],
        width = .35,
        label = 'Predicted Normal',
        color = 'g')
# Second crosstab column: samples predicted diabetic, stacked on top
plt.bar([0, 1],
        diab_table.iloc[:, 1],
        width = .35,
        color = 'b',
        label = 'Predicted Diabetic',
        bottom = diab_table.iloc[:, 0])
plt.ylabel('Count')
plt.xticks([0, 1], ('Actual Normal', 'Actual Diabetic'))
plt.grid()
plt.legend()
Out[26]:
AUC is the area under the curve formed by plotting the True Positive Rate against the False Positive Rate at different cut-off thresholds.
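If the prediction file includes the raw prediction score (AWS ML batch predictions typically carry a 'score' column next to 'bestAnswer'; verify the column name in your own output), the ROC curve and AUC can be reproduced locally with scikit-learn. A minimal sketch, assuming that 'score' column exists:
from sklearn.metrics import roc_curve, roc_auc_score
# Assumes df_predicted.score is the predicted probability of the positive class
fpr_vals, tpr_vals, thresholds = roc_curve(df.diabetes_class, df_predicted.score)
auc = roc_auc_score(df.diabetes_class, df_predicted.score)
fig = plt.figure(figsize = (12, 8))
plt.plot(fpr_vals, tpr_vals, label = 'AUC = {0:3.2f}'.format(auc))
plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()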
In [27]:
# Example showing how to compute these metrics by hand.
# AWS ML provides all of these metrics in its model evaluation.
actual_negative = df.diabetes_class.value_counts()[0]
actual_positive = df.diabetes_class.value_counts()[1]
actual_count = actual_negative + actual_positive
In [28]:
actual_negative, actual_positive
Out[28]:
In [29]:
# Rows of diab_table are actual classes; columns are predicted classes
true_negative = diab_table.iloc[0, 0]   # actual normal, predicted normal
false_positive = diab_table.iloc[0, 1]  # actual normal, predicted diabetic
true_positive = diab_table.iloc[1, 1]   # actual diabetic, predicted diabetic
false_negative = diab_table.iloc[1, 0]  # actual diabetic, predicted normal
In [30]:
diab_table
Out[30]:
In [31]:
true_negative, false_positive
Out[31]:
In [32]:
true_positive, false_negative
Out[32]:
In [33]:
# Accuracy - larger value indicates better predictive accuracy
# How many were correctly classified?
accuracy = (true_negative + true_positive) / actual_count
print('Accuracy = {0:3.2f}'.format(accuracy))
In [34]:
# True Positive Rate (also known as Recall) - larger value indicates better predictive accuracy
# Out of all actual positives, how many were correctly predicted as positive?
tpr = true_positive / actual_positive
print('Probability of detection. TPR = {0:3.2f}'.format(tpr))
In [35]:
# False Positive Rate - smaller value indicates better predictive accuracy
# Out of all actual negatives, how many were incorrectly predicted as positive?
fpr = false_positive / actual_negative
print('Probability of false alarm. FPR = {0:3.2f}'.format(fpr))
In [36]:
# Precision - out of all samples predicted as positive, how many are true positives?
# Larger value indicates better predictive accuracy
precision = true_positive / (true_positive + false_positive)
print('Precision = {0:3.2f}'.format(precision))
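As a sanity check, scikit-learn's metric helpers should reproduce the hand computations above (scikit-learn is not otherwise used in this notebook, so treat this as an optional verification):
from sklearn.metrics import accuracy_score, recall_score, precision_score
y_true = df.diabetes_class
y_pred = df_predicted.bestAnswer
# Should match the hand-computed accuracy, TPR (recall), and precision above
print('Accuracy = {0:3.2f}'.format(accuracy_score(y_true, y_pred)))
print('Recall = {0:3.2f}'.format(recall_score(y_true, y_pred)))
print('Precision = {0:3.2f}'.format(precision_score(y_true, y_pred)))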
Advanced Metrics